library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Inverse correlation matrix --------------------------------------------------
# Truck-driver data: 20 rows, two numeric columns (truc.dr, revenue).
truckDrivers <- read_csv("/Users/parinithakompala/Desktop/QBS124/ass3/truckR.data.csv")
n <- nrow(truckDrivers)
# Add an explicit time index (1..n) as a third variable.
truckDrivers$time <- seq_len(n)
head(truckDrivers)
# Pairwise Pearson correlation matrix of the three columns.
mat <- cor(truckDrivers)
mat
##           truc.dr   revenue      time
## truc.dr 1.0000000 0.9423984 0.9539531
## revenue 0.9423984 1.0000000 0.9672527
## time    0.9539531 0.9672527 1.0000000
# Inverse of the correlation matrix; its rescaled off-diagonal entries yield
# the partial correlations (computed in the next section).
mat_inv <- solve(mat)
mat_inv
##           truc.dr   revenue       time
## truc.dr 11.910612 -3.639377  -7.841967
## revenue -3.639377 16.634643 -12.618108
## time    -7.841967 -12.618108  20.685768
# Partial correlations from the inverse correlation matrix:
#   pcor(i, j) = -invR[i, j] / sqrt(invR[i, i] * invR[j, j])
R <- mat
iiR <- solve(R)
ns <- ncol(R)
# Vectorized rescaling of the inverse (replaces the original double loop).
dg <- diag(iiR)
iR <- -iiR / sqrt(abs(outer(dg, dg)))
diag(iR) <- rep(NA, ns)  # blank the diagonal so it is neither coloured nor printed
# NOTE(review): the original breaks c(-.25,.5,.35,.5,.75) were unsorted and
# contained a duplicate (hence the "unsorted 'breaks'" warning); image() needs
# a strictly increasing vector of length length(col) + 1. The values below
# assign the same colours to this data's partial correlations.
image(1:ns, 1:ns, iR,
      col = c("cyan", "goldenrod1", "yellow", "pink2"),
      breaks = c(-0.25, 0.35, 0.5, 0.6, 0.75),
      xlab = "", ylab = "", axes = FALSE)
# Variable names on all four sides of the plot.
axis(side = 1, at = 1:ns, labels = names(truckDrivers), font = 2)
axis(side = 2, at = 1:ns, labels = names(truckDrivers), srt = 90, font = 2)
axis(side = 3, at = 1:ns, labels = names(truckDrivers), font = 2)
axis(side = 4, at = 1:ns, labels = names(truckDrivers), font = 2)
# Overlay the rounded partial-correlation values.
for (i in seq_len(ns))
  text(rep(i, ns), 1:ns, round(iR[i, ], 2), font = 2)
## Correlation of residuals ----------------------------------------------------
# Alternatively, the partial correlation coefficient can be computed as the
# correlation between the residuals of the regressions X ~ Z and Y ~ Z.
# Here X = truc.dr, Y = revenue, Z = time.
x <- lm(truc.dr ~ time, data = truckDrivers)$residuals
x
y <- lm(revenue ~ time, data = truckDrivers)$residuals
y
# Partial correlation of truc.dr and revenue controlling for time; should
# equal the rescaled inverse-correlation entry for this pair.
M <- cor(x, y)
# Full partial-correlation matrix via the residual method, pairwise over all
# columns. Reference:
# https://towardsdatascience.com/keeping-an-eye-on-confounds-a-walk-through-for-calculating-a-partial-correlation-matrix-2ac6b831c5b6
partialCors <- list()
truckColNames <- colnames(truckDrivers)
for (i in seq_along(truckColNames)) {
  target <- truckColNames[i]
  covariatesAll <- setdiff(truckColNames, target)
  # Preallocate instead of growing a zero-length double() in the loop.
  crntPcor <- numeric(length(covariatesAll))
  for (j in seq_along(covariatesAll)) {
    covarLeftOut <- covariatesAll[j]
    covariatesCrnt <- setdiff(covariatesAll, covarLeftOut)
    # Regress both the target and the left-out covariate on the remaining
    # covariates, then correlate their residuals (reformulate() replaces the
    # manual paste()/as.formula() formula construction).
    mod1 <- lm(reformulate(covariatesCrnt, response = target),
               data = truckDrivers)
    mod2 <- lm(reformulate(covariatesCrnt, response = covarLeftOut),
               data = truckDrivers)
    crntPcor[j] <- cor(resid(mod1), resid(mod2))
  }
  # Insert the diagonal 1 at position i of this column.
  partialCors[[i]] <- append(crntPcor, 1, i - 1)
}
# unlist() fills the matrix column by column; the result is symmetric by
# construction.
partialCorMat <- matrix(unlist(partialCors),
                        ncol = length(truckColNames),
                        nrow = length(truckColNames),
                        dimnames = list(truckColNames, truckColNames))
partialCorMat
##           truc.dr   revenue      time
## truc.dr 1.0000000 0.2585552 0.4995997
## revenue 0.2585552 1.0000000 0.6802236
## time    0.4995997 0.6802236 1.0000000
# Heatmap of the residual-based partial-correlation matrix (same layout as the
# inverse-correlation heatmap above; the two matrices agree).
iR <- partialCorMat
ns <- ncol(iR)
diag(iR) <- rep(NA, ns)  # hide the uninformative diagonal of 1s
# NOTE(review): strictly increasing breaks of length length(col) + 1; the
# original unsorted/duplicated vector triggered an "unsorted 'breaks'"
# warning from image().
image(1:ns, 1:ns, iR,
      col = c("cyan", "goldenrod1", "yellow", "pink2"),
      breaks = c(-0.25, 0.35, 0.5, 0.6, 0.75),
      xlab = "", ylab = "", axes = FALSE)
axis(side = 1, at = 1:ns, labels = names(truckDrivers), font = 2)
axis(side = 2, at = 1:ns, labels = names(truckDrivers), srt = 90, font = 2)
axis(side = 3, at = 1:ns, labels = names(truckDrivers), font = 2)
axis(side = 4, at = 1:ns, labels = names(truckDrivers), font = 2)
# Overlay the rounded partial-correlation values.
for (i in seq_len(ns))
  text(rep(i, ns), 1:ns, round(iR[i, ], 2), font = 2)
Answer: According to Example 1, we can see that U is the common factor that controls the variation in X and Y and also causes the correlation between X and Y; Z, Q, and U are independent. The variance of U measures the strength of U's influence over X and Y. Here we have var(U) = \(\sigma^2\): if \(\sigma^2 = 0\), then X and Y are independent and cor = 0, so cor(X, Y) is a function of \(\sigma^2\); if \(\sigma^2 = \infty\), cor = 1 and everything is dominated by U. Since covariance is a linear function, the correlation between X and Y is positive. We should exclude U to refine the correlation between X and Y, and for this we need to consider "partial correlation." As said, "Every time you see an expected high correlation it may be caused by the presence of a common factor that influences both variables" — in this case it is U. We regress X on the other variables and regress Y on the other variables; by doing so we exclude the influence of the other variables and can find the correlation between X and Y alone. We can do this with residuals, or use the inverse correlation matrix, to exclude the common variable and obtain the correlation between X and Y. In other words, the strength of a relationship between two variables is measured using partial correlation, which accounts for the effect of one or more other factors. For example, while controlling for weight and exercise (U), you might wish to investigate whether there is a link between the amount of food consumed (X) and blood pressure (Y). Multiple variables can be controlled at the same time (these are called control variables or covariates). However, more than one or two control variables are usually not suggested, because the more control variables you have, the less reliable your test becomes. In partial correlation there is one continuous independent variable (the x value) and one continuous dependent variable (the y value), just as when performing a standard correlation analysis.
In the blood pressure example, the independent variable is the "amount of food eaten," while the dependent variable is "blood pressure." Weight and amount of exercise are the continuous control variables.
## Goldman osteometric data ----------------------------------------------------
# Read the skeleton-measurement data set and take a quick first look at its
# contents and dimensions.
skelet <- read.csv("/Users/parinithakompala/Desktop/QBS124/ass4/Goldman.csv")
head(skelet)
dim(skelet)
## [1] 1538   69
# Re-read with strings kept as characters. Column 3 is coerced to numeric
# (non-numeric entries become NA with a coercion warning); presumably it holds
# a sex code — TODO confirm against the Goldman codebook.
d <- read.csv("/Users/parinithakompala/Desktop/QBS124/ass4/Goldman.csv",
              stringsAsFactors = FALSE)
female <- as.numeric(d[, 3])
# Keep only the measurement columns and drop the two derived FHD columns.
d <- d[, 18:ncol(d)]
d <- subset(d, select = -McH.FHD)
d <- subset(d, select = -AVG.FHD)
nm <- names(d)
nr <- nrow(d)
# Drop every column that is entirely NA. (The original break-out loop removed
# only the FIRST such column, and left `alln` undefined — an error on the
# next line — if no all-NA column existed.)
alln <- which(vapply(d, function(col) all(is.na(col)), logical(1)))
if (length(alln) > 0) {
  d <- d[, -alln]
  nm <- nm[-alln]
}
nc <- ncol(d)
# Pairwise-complete correlations: each pair of columns uses every row where
# both measurements are observed.
R <- cor(d, use = "pairwise.complete.obs")
n <- nrow(R)
# Banded correlation heatmap with the measurement names along the margins.
par(mfrow = c(1, 1), mar = c(0, 1, 2, 1))
cl <- c("deepskyblue", "lightblue", "green", "yellow", "red")
image(1:n, 1:n, R,
      breaks = c(-.75, -.5, 0, .5, .75, 1), col = cl,
      ylim = c(-5, n + 1), xlim = c(-2, n + .5),
      ylab = "", xlab = "", axes = FALSE)
# Variable names: rotated along the bottom, horizontal along the left edge.
text(1:n, rep(0.5, n), nm, adj = 1, cex = .75, srt = 45)
text(rep(.3, n), 1:n, nm, adj = 1, cex = .75)
# Print each correlation coefficient inside its cell.
for (ii in 1:n)
  for (jj in 1:n)
    text(ii, jj, round(R[ii, jj], 2), cex = .5)
mtext(side = 3,
      paste("Correlation heatmap of", n,
            "osteometric measurements taken from 1538 human skeletons"),
      cex = 2)
# Colour legend matching the break intervals above.
br <- c("-0.75 to -0.5", "-0.5 to 0", "0 to 0.5", "0.5 to 0.75", "0.75 to 1.0")
legend(11, -2, br, col = cl, pch = 15, horiz = TRUE, cex = 1.25)
# Alternative view: corrplot of correlations computed on complete observations
# only (every row containing any NA is dropped before cor()).
d <- as.matrix(d)
M <- cor(d, method = "pearson", use = "complete.obs")
corrplot(M, method = 'number')
# Redraw the banded heatmap using an alternative colour palette.
par(mfrow = c(1, 1), mar = c(0, 1, 2, 1))
cl <- c("cadetblue1", "plum2", "darkolivegreen1", "darkslategray1", "deeppink")
image(1:n, 1:n, R,
      breaks = c(-.75, -.5, 0, .5, .75, 1), col = cl,
      ylim = c(-5, n + 1), xlim = c(-2, n + .5),
      ylab = "", xlab = "", axes = FALSE)
# Measurement names on the margins, then the rounded values in each cell.
text(1:n, rep(0.5, n), nm, adj = 1, cex = .75, srt = 45)
text(rep(.3, n), 1:n, nm, adj = 1, cex = .75)
for (ii in 1:n)
  for (jj in 1:n)
    text(ii, jj, round(R[ii, jj], 2), cex = .5)
mtext(side = 3,
      paste("Correlation heatmap of", n,
            "osteometric measurements taken from 1538 human skeletons"),
      cex = 2)
br <- c("-0.75 to -0.5", "-0.5 to 0", "0 to 0.5", "0.5 to 0.75", "0.75 to 1.0")
legend(11, -2, br, col = cl, pch = 15, horiz = TRUE, cex = 1.25)
# load package
library(pheatmap)
# d=read.csv("/Users/parinithakompala/Desktop/QBS124/ass4/Goldman.csv",stringsAsFactors=F)
# female=as.numeric(d[,3])
# d=d[,18:ncol(d)]
# #d = select(d, -McH.FHD, -AVG.FHD) #removing the columns
# d <- subset (d, select = -McH.FHD)
# d <- subset (d, select = -AVG.FHD)
# nm=names(d)
# nc=ncol(d);nr=nrow(d)
# for(i in 1:nc)
# if(sum(is.na(d[,i]))==nr) {alln=i;break}
# d=d[,-alln]
# nm=nm[-alln]
# nc=ncol(d)
# R=cor(d,use="pairwise.complete.obs")
# n=nrow(R)
#
# R=as.data.frame(R)
# names(R)=names(d)
# pheatmap(R)
# readline()
# R=solve(R)
# for(i in 1:ncol(R))
# for(j in 1:ncol(R))
# if(i>j)
# R[i,j]=R[j,i]=-R[i,j]/sqrt(R[i,i]*R[j,j])
# diag(R)=1
# pheatmap(R)